---
title: |
  | Pre-analysis Plans: An Early Stocktaking
  | Replication of Descriptive Analysis
author: |
  | George K. Ofosu and Daniel N. Posner
date: "`r format(Sys.Date(),'%B %d, %Y')`"
output:
  tufte::tufte_handout: 
    highlight: tango 
    df_print: kable
link-citations: yes
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
library(tufte)
# invalidate cache when the tufte version changes
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tufte'))
options(htmltools.dir.version = FALSE, digits = 2)
library(formatR)
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=35),tidy=TRUE)
```



\newpage
```{r results = 'hide', message=FALSE,echo=FALSE, eval=TRUE, warning=FALSE}
# R version 4.0.1 (2020-06-06)
#install packages
#install.packages(c("memisc", "tidyverse"))

#load required libraries
library(tidyverse) # tidyverse_1.3.0
library(ggthemes)
library(scales) 
library(viridis)
library(gridExtra)


# set working directory
#setwd("")


#load dataset

pap_dat <- read_csv('master_pap_data08202019.csv')

pap_dat <- pap_dat%>%
  mutate(registry = factor(registry),
         publication = factor(publication),
         `type of study`=factor(`type of study`)
)

```


# 1.Hypotheses 
```{r}

hdata <- select(pap_dat, clearhyp:pointoutnew,primary_secondary_hyp_maintained)
#names(hdata)

stargazer::stargazer(as.data.frame(hdata), title="Hypotheses: clarity, quantity, and reporting",covariate.labels = c("PAP had clear hypotheses", 
"PAP specified more than one hypothesis", 
"Unclear whether PAP specified more than one hypothesis","# hypotheses specified"
,"# hypotheses was unclear",
"PAP distinguished primary verus secondary hypotheses", "# primary hypotheses",
"# primary hypotheses unclear", "# primary hypotheses presented in main body published paper","# primary hypotheses presented in main body of paper unclear","# primary hypotheses presented in the appendix of paper", "# primary hypotheses presented in the appendix of paper (revised)","# primary hypotheses presented in appendix unclear", "# primary hypotheses in article's main body supported", "Unclear: # hypotheses presented in main body of paper supported"
,"# primary hypotheses presented in the appendix supported", "# primary hypotheses presented in the appendix supported (revised)","Unclear: # of primary hypotheses presented in appendix supported", "# of secondary hypotheses in main body of the paper","# of secondary hypotheses in the appendix of paper","New hypotheses reported in published article","Author(s) acknowledge mew hypotheses", "Primary vs. secondary hypotheses distinction maintained in paper"),
                     type = "text",
                     summary.stat = c("n","mean","sd"))
```



# 2. Clarity of variable definition 

```{r, fig.width=10,fig.height=3, fig.fullwidth=TRUE}

vdata <- pap_dat %>%
  select(cleardepvar, changevar,describechangedepvar,seconddepvarsclear,clearindvar, changeindvar, describechangeindvar,ncontrols,ncontrols_unclear)
  
stargazer::stargazer(as.data.frame(vdata),type= "text", summary.stat = c("n","mean", "sd"), title = "Clarity of definition, changes, and reporting of variables",covariate.labels = c("Specified clear dependent variable (DV)","Changed primary DV in main paper", "Acknowledged change in DV", "Specified clear dependent variable (DV)","Specified clear treatment variable (IV)","Changed independent variable in paper", "Pointed out change in IV","# clear control variables", "# of control variables unclear"))
```

# 3. Sampling and power analysis

```{r, full.width=TRUE}

sdata <- pap_dat %>%
  select(popofinterest:poweranalysis)
  
stargazer::stargazer(as.data.frame(sdata),type= "text", summary.stat = c("n","mean", "sd"), title = "Clarity of definition, changes, and reporting of variables", covariate.labels = c("Clearly specified the population of interest", "Clearly specified the sampling frame", "Clearly specified the sampling strategy", "Specified the conditions under which to exclude units", "Use power analysis to justify sample size"))
```


# 4. Data collection

```{r, full.width=TRUE}

ddata <- pap_dat %>%
  select(treatmentundercontrol:imbalance)
  
stargazer::stargazer(as.data.frame(ddata),summary.stat = c("n","mean","sd"), type= "text", covariate.labels = c("Treatment assignment controlled by author(s)","Unclear whether author(s) controlled  treatment assignment", "Specified randomization procedure", "Specified manipulation checks (balance table)", "Specified how to deal with covariate imbalance"))
```


# 5. Inclusion and exclusion rules 

```{r, full.width=TRUE}

inexdata <- pap_dat %>%
  select(rulesmissing:follownoncomp)
  

stargazer::stargazer(as.data.frame(inexdata),summary.stat = c("n","mean", "sd"), type="text", covariate.labels = c(
  "Specified the rules dealing with missingness/attrition", 
  "Specified the rules for dealing with outliers", "Specified the rules for dealing with noncompliance","Followed rules for dealing with missingness and attrition","Followed rules for dealing with outliers", "Followed rules for dealing with noncompliance"))

```


# 6. Statistical model specification

```{r, fig.width=10,fig.height=3, fig.fullwidth=TRUE}

statdata <- pap_dat %>%
  select(statmodel:covars)%>%
  select(-statdevdescr)
  

stargazer::stargazer(as.data.frame(statdata),summary.stat = c("n","mean", "sd"), type="text", covariate.labels = c("Specified a clear statistical model","Deviated from specified statistical model", "Pointed out deviation from statistical model", "Specified how to estimate standard errors", "Specified how to adjust multiple hypotheses testing","Researcher commit to simple difference in means test", "Specified direction of hypotheses testing", "Specified whether and how control variables  will be included in analysis"))

```

# 7. Other features 

```{r, fig.width=10,fig.height=3, fig.fullwidth=TRUE}

otherdata <- pap_dat %>%
  select(updated:pages)
  
  
names(otherdata)


stargazer::stargazer(as.data.frame(otherdata),summary.stat = c("n","mean", "sd"), type="text",covariate.labels = c("Plan was updated", "If so, number of times updated", "Changes were clearly marked", "Study had IRB approval", "Study pre-specify how to deal with unexpected events (e.g., SOP)","Pages (in single-spaced pages)"))

```


# 8. Computations 

## a. new hypotheses restricted to whether hypotheses were clearly specified 

```{r}
mean(pap_dat$newhyp[pap_dat$clearhyp==1], na.rm = T)

mean(pap_dat$pointoutnew[pap_dat$clearhyp==1&pap_dat$newhyp==1], na.rm = T)

table(pap_dat$pointoutnew[pap_dat$clearhyp==1&pap_dat$newhyp==1])
```



## b. categorizing the number of hypotheses specified

```{r, fig.width=10,fig.height=3, fig.fullwidth=TRUE,  message=FALSE,echo=TRUE, eval=TRUE, warning=FALSE}
pap_dat <-mutate(pap_dat, 
      cat_nhyp=ifelse(howmanyhyp <=5, 1,
      ifelse(howmanyhyp> 5 & howmanyhyp <=10,2,
      ifelse(howmanyhyp > 10&howmanyhyp <=20, 3,
      ifelse(howmanyhyp> 20&howmanyhyp <=50,4,
      ifelse(howmanyhyp> 50,5, NA ))))), 
      cat_nhyp_dummy=ifelse(cat_nhyp==1,1,
      ifelse(cat_nhyp>1,2,NA)),
      cat_nprimaryhyp=ifelse(howmanyprimary <=5, 1,
      ifelse(howmanyprimary> 5&howmanyprimary <=10,2,
    ifelse(howmanyprimary> 10&howmanyprimary <=20, 3,
    ifelse(howmanyprimary> 20&howmanyprimary <=50,4,
  ifelse(howmanyprimary> 50,5,NA))))),
  pages_cat =ifelse(pages <= 5, 1, 
  ifelse(pages>5 &pages <=10,2, 
  ifelse(pages>10 &pages <=15,3,ifelse(pages >15 & pages <=20, 4, 
        ifelse(pages >20 & pages <=25, 5, 
        ifelse(pages >25 & pages <=30,6,
        ifelse(pages >30 & pages <= 40,7,
        ifelse(pages >40,8, NA) )))))           
))
)

#prop.table(table(pap_dat$cat_nhyp))

#tab1 <- table(pap_dat$cat_nhyp)

#number of hypotheses



p1 <- ggplot(filter(pap_dat,!is.na(cat_nhyp)),
             aes(x=factor(cat_nhyp),group=1))+
  geom_bar(aes(y = (..count..)/sum(..count..)))+
  scale_y_continuous(labels=scales::percent, limits = c(0,0.45)) +
  scale_x_discrete(labels=c("1-5","6-10","11-20","21-50","50+"))+
  geom_text(aes( label = str_c("(",..count..,")",sep = ""),y= ..prop.. ), 
            stat= "count", vjust = -.5,nudge_x =-.1,size=6) +
  geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), 
            stat= "count", vjust = -.5,nudge_x =.3 ,size=6) +
  labs(x="Number of hypotheses specified",y="Percentage of studies",title = "Panel A")+
  theme_tufte()+
  theme(axis.title = element_text(face = "bold",size = 24,colour = "black"),
        legend.title = element_text(face = "bold"),
        legend.key =element_rect(fill = "white",color = "white"),
        axis.ticks = element_line(size = 2),
        panel.background =element_blank(),
        axis.line = element_line(size = 1, linetype = "solid"),
        title = element_text(family = "serif",size=13,colour = "black",face = "bold"), 
        axis.text.y = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        axis.text.x = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        strip.text = element_text(size=12, face="bold"),
        legend.text =element_text(family = "serif",size=8,colour = "black"),
        panel.grid.minor = element_line(colour="grey", size=0.5))



p2 <- ggplot(filter(pap_dat,!is.na(cat_nprimaryhyp) & cat_nhyp_dummy==2 & primary==1),aes(x=factor(cat_nprimaryhyp),group=1))+
  geom_bar(aes(y = (..count..)/sum(..count..)))+
  scale_y_continuous(labels=scales::percent,limits = c(0,0.45)) +
  scale_x_discrete(labels=c("1-5","6-10","11-20","21-50","50+"))+
  geom_text(aes( label = str_c("(",..count..,")",sep = ""),y= ..prop.. ), stat= "count", vjust = -.5,nudge_x =-.1,size=6) +
  geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5,nudge_x =.3 ,size=6) +
  labs(x="Number of primary hypotheses specified",y="Studies with more than 5 hypotheses that distinguished \n primary from secondary hypotheses",title = "Panel B")+
  theme_tufte()+
  theme(axis.title = element_text(face = "bold",size = 24,colour = "black"),
        legend.title = element_text(face = "bold"),
        legend.key =element_rect(fill = "white",color = "white"),
        axis.ticks = element_line(size = 2),
        panel.background =element_blank(),
        axis.line = element_line(size = 1, linetype = "solid"),
        title = element_text(family = "serif",size=13,colour = "black",face = "bold"), 
        axis.text.y = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        axis.text.x = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        strip.text = element_text(size=12, face="bold"),
        legend.text =element_text(family = "serif",size=8,colour = "black"),
        panel.grid.minor = element_line(colour="grey", size=0.5))





#ggsave(filename = "num_hyp_twoparts.pdf",arrangeGrob(p1, p2,nrow = 1,ncol = 2), width=20,height = 12)




```

```{r, fig.width=10,fig.height=8, fig.fullwidth=TRUE,  message=FALSE,echo=TRUE, eval=TRUE, warning=FALSE}
p1

```

```{r, fig.width=10,fig.height=8, fig.fullwidth=TRUE,  message=FALSE,echo=TRUE, eval=TRUE, warning=FALSE}
p2
```



## c. Proportion that distinguish between primary and secondary hypotheses by number of hypotheses 

```{r}


## number of specified hypotheses by whether author distinguished primary vs. secondary hypotheses

prop.table(table(pap_dat$cat_nhyp,pap_dat$primary),1)

prop.table(table(pap_dat$cat_nhyp_dummy,pap_dat$primary),1)

## 
summary(pap_dat$howmanyprimary[pap_dat$cat_nhyp_dummy==2&pap_dat$primary==1])



tab2 <- table(pap_dat$cat_nprimaryhyp[pap_dat$cat_nhyp_dummy==2&pap_dat$primary==1])

prop.table(table(pap_dat$cat_nprimaryhyp[pap_dat$cat_nhyp_dummy==2&pap_dat$primary==1]))

#stargazer::stargazer(as.matrix(tab2))
```

## d. Proportion of researchers who distinquished primary and secondary hypotheses who maintained such distinction in the resulting paper

```{r}
## restricted to PAPs specifying more than 5 primary hypotheses
prop.table(table(pap_dat$primary_secondary_hyp_maintained[pap_dat$cat_nhyp_dummy==2&pap_dat$primary==1]))

### proportion who maintained distinction in main paper
prop.table(table(pap_dat$primary_secondary_hyp_maintained))
```


## f. multiple hypotheses testing among PAPs that specify more that 5 hypotheses

```{r}
prop.table(table(pap_dat$cat_nhyp_dummy,pap_dat$multipletest),1)
```

## g. Reporting of specified hypotheses

```{r, fig.width=10,fig.height=8, fig.fullwidth=TRUE,  message=FALSE,echo=TRUE, eval=TRUE, warning=FALSE}

pap_dat <-mutate(pap_dat,
  presentedprimaryappendix_rev0=ifelse(is.na(presentedprimaryappendix),0,presentedprimaryappendix),
    primaryreported_perc=presentedpimarymainbody/howmanyprimary, 
  primaryreported_perc_dum=ifelse(primaryreported_perc <1,0,
  ifelse(primaryreported_perc ==1, 1, NA)),
  primarymain_appendix=presentedpimarymainbody+presentedprimaryappendix_rev0,
    total_primary_reported=
    ifelse(primarymain_appendix==howmanyprimary,1,
    ifelse(primarymain_appendix<howmanyprimary,0,NA)),
    howmanyprimary_rev=ifelse(primary==0 &is.na(howmanyprimary),howmanyhyp,howmanyprimary),
     primaryreported_perc2=
    presentedpimarymainbody/howmanyprimary_rev, primaryreported_perc_dum2=
    ifelse(primaryreported_perc2 <1,0,
    ifelse(primaryreported_perc2 ==1, 1, NA)),
  primarymain_appendix2=
    presentedpimarymainbody
  +presentedprimaryappendix_rev0,
  total_primary_reported2=
    ifelse(primarymain_appendix==howmanyprimary_rev,1,
    ifelse(primarymain_appendix<howmanyprimary_rev,0,NA)),
  presentedpimarymainbody_rev=
    ifelse(is.na(presentedpimarymainbody),0,
           presentedpimarymainbody),
   presentedsecondarymainbody_rev=
    ifelse(is.na(presentedsecondarymainbody), 0,presentedsecondarymainbody),
 presentedsecondaryappendix_rev=
   ifelse(is.na(presentedsecondaryappendix), 0,presentedsecondaryappendix),
 total_hyp_reported=presentedpimarymainbody_rev
 +presentedprimaryappendix_rev0
 +presentedsecondarymainbody_rev
 +presentedsecondaryappendix_rev,
  total_hyp_reported_dum=ifelse(total_hyp_reported==howmanyhyp,1,
  ifelse(total_hyp_reported<howmanyhyp,0,NA)),
 prop_reported=ifelse(publication=="PAP with publication",total_hyp_reported/howmanyhyp,NA)
)

## proportion of papers reporting primary hypotheses reported in main paper (excluding appendix)
prop.table(table(pap_dat$primaryreported_perc_dum[pap_dat$publication=="PAP with publication"]))

## proportion of papers reporting all pre-specified primary  hypotheses in main and appendix of paper
prop.table(table(pap_dat$total_primary_reported2[pap_dat$publication=="PAP with publication"]))

## Proportion of paper reporting less than pre-specified total hypotheses 
prop.table(table(pap_dat$total_hyp_reported_dum[pap_dat$publication=="PAP with publication"]))

## Median proportion of total pre-specified hypotheses reported

median(pap_dat$prop_reported,na.rm = T)

```



## Number and share of PAPs satisfying the four key requirements of a complete 

```{r, fig.width=15,fig.height=12, fig.fullwidth=TRUE,  message=FALSE,echo=FALSE, eval=TRUE, warning=FALSE}

pap_dat <-mutate(pap_dat, perf_pap =rowSums(cbind(clearhyp,cleardepvar,clearindvar,statmodel), na.rm=T),period =as_factor(ifelse(as.numeric(year)<2016, "2011-2015","2016")) )

table(pap_dat$period)


### Generate plot of 
ggplot(pap_dat,aes(x=factor(perf_pap),group=1))+
  geom_bar(aes(y = (..count..)/sum(..count..)))+
  scale_y_continuous(labels=scales::percent,limits = c(0,0.6)) +
  #scale_x_discrete(labels=c("1-5","6-10","11-20","21-50","50+"))+
  geom_text(aes( label = str_c("(",..count..,")",sep = ""),y= ..prop.. ), stat= "count", vjust = -.5,nudge_x =-.1,size=6) +
  geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5,nudge_x =.3 ,size=6) +
  labs(x="Number out of four key requirements for a precise analysis plan satisfied",y="Percentage of studies",title = "")+
  theme_tufte()+
  theme(axis.title = element_text(face = "bold",size = 24,colour = "black"),
        legend.title = element_text(face = "bold"),
        legend.key =element_rect(fill = "white",color = "white"),
        axis.ticks = element_line(size = 2),
        panel.background =element_blank(),
        axis.line = element_line(size = 1, linetype = "solid"),
        title = element_text(family = "serif",size=13,colour = "black",face = "bold"), 
        axis.text.y = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        axis.text.x = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        strip.text = element_text(size=12, face="bold"),
        legend.text =element_text(family = "serif",size=8,colour = "black"),
        panel.grid.minor = element_line(colour="grey", size=0.5))

#ggsave(filename = "precise_pap.pdf", width=20,height = 12)
```



## i.number of pages

```{r, fig.width=20,fig.height=14, fig.fullwidth=TRUE, message=FALSE,echo=FALSE, eval=TRUE, warning=FALSE}
table(pap_dat$pages_cat)

ggplot(filter(pap_dat,!is.na(pages_cat)),aes(x=factor(pages_cat),group=1))+
  geom_bar(aes(y = (..count..)/sum(..count..)))+
  scale_y_continuous(labels=scales::percent) +
  scale_x_discrete(labels=c("1-5","6-10","11-15","16-20","21-25", "26-30", "31-40", "41+"))+
  geom_text(aes( label = str_c("(",..count..,")",sep = ""),y= ..prop.. ), stat= "count", vjust = -.5,nudge_x =-.1,size=6) +
  geom_text(aes( label = scales::percent(..prop..),y= ..prop.. ), stat= "count", vjust = -.5,nudge_x =.3 ,size=6) +
  labs(x="Number of pages of pre-analysis plan (single spaced)",y="Percentage of studies")+
theme_tufte()+
  theme(axis.title = element_text(face = "bold",size = 24,colour = "black"),
        legend.title = element_text(face = "bold"),
        legend.key =element_rect(fill = "white",color = "white"),
        axis.ticks = element_line(size = 2),
        panel.background =element_blank(),
        axis.line = element_line(size = 1, linetype = "solid"),
        title = element_text(family = "serif",size=13,colour = "black",face = "bold"), 
        axis.text.y = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        axis.text.x = element_text(family = "serif",size=18,colour = "black",face = "bold"),
        strip.text = element_text(size=12, face="bold"),
        legend.text =element_text(family = "serif",size=8,colour = "black"),
        panel.grid.minor = element_line(colour="grey", size=0.5))


#ggsave("fig_pages.pdf")
```











